import scrapy
from turorial.items import ItcastTeacherItem


class ItcastSpider(scrapy.Spider):
    """Spider that scrapes teacher profiles from the itcast.cn teacher page.

    Yields one ``ItcastTeacherItem`` per teacher card, to be handled by the
    configured item pipeline.
    """

    name = 'itcast'

    # FIX: was misspelled `allowd_domains`, which Scrapy silently ignores
    # (no offsite filtering happened at all). Also, allowed_domains takes
    # bare domain names, never full URLs.
    allowed_domains = ["itcast.cn"]

    # Pages to crawl (any '#' fragment in a URL can be ignored).
    start_urls = ["http://www.itcast.cn/channel/teacher.shtml"]

    def parse(self, response):
        """Parse the teacher listing page and yield one item per teacher.

        :param response: the downloaded page response for a start URL
        :return: generator of ``ItcastTeacherItem``
        """
        # Each teacher card lives in a <div class="li_txt"> element.
        teachers = response.xpath('//div[@class="li_txt"]')

        for teacher in teachers:
            # One item instance per teacher card.
            item = ItcastTeacherItem()

            # extract_first() returns None instead of raising IndexError
            # when a card is missing a field (the old `[0].extract()` would
            # crash the whole parse on a single malformed card). The result
            # is a unicode string, same as before.
            item["name"] = teacher.xpath('./h3/text()').extract_first()
            item["position"] = teacher.xpath('./h4/text()').extract_first()
            item["info"] = teacher.xpath('./p/text()').extract_first()

            # Hand the item to the pipeline via the generator protocol.
            yield item
